{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "plt.style.use(\"ggplot\")\n",
    "\n",
    "import itertools\n",
    "import sys\n",
    "import pickle\n",
    "sys.path.append('../code')\n",
    "from classification import CV_classification, classification_onefold, bs_p_value, bs_p_value_skewness\n",
    "from lp import *\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from scipy.spatial import ConvexHull\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.preprocessing import StandardScaler, PolynomialFeatures"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocess Dataset\n",
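    "* Data source: the diabetes hospital dataset loaded through `fairlearn.datasets.fetch_diabetes_hospital` ([data description](https://fairlearn.org/v0.10/user_guide/datasets/diabetes_hospital_data.html))."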
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Features shape: (101766, 24)\n",
      "Target shape: (101766,)\n",
      "Feature names: ['race', 'gender', 'age', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'primary_diagnosis', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'insulin', 'change', 'diabetesMed', 'medicare', 'medicaid', 'had_emergency', 'had_inpatient_days', 'had_outpatient_days', 'readmitted', 'readmit_binary']\n"
     ]
    }
   ],
   "source": [
    "from fairlearn.datasets import fetch_diabetes_hospital\n",
    "\n",
    "# Load the dataset\n",
    "dataset = fetch_diabetes_hospital()\n",
    "\n",
    "# Access the features and target\n",
    "X = dataset.data  # Features\n",
    "y = dataset.target  # Target variable\n",
    "\n",
    "# Access feature names\n",
    "feature_names = dataset.feature_names\n",
    "\n",
    "# Optionally, print the shape of the features and target to understand the dataset size\n",
    "print(\"Features shape:\", X.shape)\n",
    "print(\"Target shape:\", y.shape)\n",
    "print(\"Feature names:\", feature_names)\n",
    "\n",
    "# Raw frame: the target column first, followed by every feature column.\n",
    "df_raw = pd.concat([pd.DataFrame(y), pd.DataFrame(X, columns=feature_names)], axis=1)\n",
    "\n",
    "categorical_features = [\n",
    "    \"race\",\n",
    "    \"gender\",\n",
    "    \"age\",\n",
    "    \"discharge_disposition_id\",\n",
    "    \"admission_source_id\",\n",
    "    \"medical_specialty\",\n",
    "    \"primary_diagnosis\",\n",
    "    \"max_glu_serum\",\n",
    "    \"A1Cresult\",\n",
    "    \"insulin\",\n",
    "    \"change\",\n",
    "    \"diabetesMed\",\n",
    "    \"readmitted\"\n",
    "]\n",
    "\n",
    "# Columns removed before one-hot encoding.\n",
    "dropped_columns = ['max_glu_serum', 'A1Cresult', 'discharge_disposition_id', 'readmitted']\n",
    "\n",
    "# Build the cleaned frame as a single chain of non-mutating steps, so re-running\n",
    "# this cell always starts again from `df_raw`. Nothing is assigned into a filtered\n",
    "# slice (no SettingWithCopyWarning), and the 0/1 indicators come from direct\n",
    "# comparisons rather than `.replace` on categorical columns, which recent pandas\n",
    "# versions deprecate.\n",
    "df = (\n",
    "    df_raw\n",
    "    .astype({col_name: \"category\" for col_name in categorical_features})\n",
    "    # drop gender group Unknown/Invalid\n",
    "    .query(\"gender != 'Unknown/Invalid'\")\n",
    "    # keep only Caucasian and AfricanAmerican\n",
    "    .loc[lambda frame: frame['race'].isin(['Caucasian', 'AfricanAmerican'])]\n",
    "    # female = 1 for 'Female', black = 1 for 'AfricanAmerican', 0 otherwise;\n",
    "    # assigning to the existing columns keeps their original position\n",
    "    .assign(\n",
    "        gender=lambda frame: (frame['gender'] == 'Female').astype('int'),\n",
    "        race=lambda frame: (frame['race'] == 'AfricanAmerican').astype('int'),\n",
    "    )\n",
    "    .rename(columns={'gender': 'female', 'race': 'black'})\n",
    "    # drop some columns\n",
    "    .drop(columns=dropped_columns)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "# One-hot encode the remaining categorical columns, dropping the first level of each.\n",
    "df_encoded = pd.get_dummies(df, drop_first=True)\n",
    "\n",
    "# Outcome columns (every column whose name contains 'admit') and group columns.\n",
    "Y_columns = [col for col in df_encoded.columns if 'admit' in col]\n",
    "G_columns = ['black', 'female']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RangeIndex(start=0, stop=95309, step=1)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the index to confirm it is a 0-based RangeIndex after the row filtering and reset_index.\n",
    "df_encoded.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the encoded frame as CSV in the working directory; the row index is not written.\n",
    "output_csv = 'diabetes_fairlearn.csv'\n",
    "df_encoded.to_csv(output_csv, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
